####################################################
### Sbert Cross Encoder #########
### Script: 03_cross_encode_sts.py
### Purpose: This code runs the bioweapons pairs
### through the Sbert cross encoder
### Data In:
## potential matches
## data/potential_matches_bioweapons_cosine_sbert.rds
### Data Out:
### 200 data sets with cross encoder value saved as "cross_score"
### each with form
### data/cross_encoder/cross_encoder_bioweapons_cosine_sts_{item}.csv
## Notes: needs to be run as array job with arrays 0-199 (the data is split into 200 parts)

from sentence_transformers import CrossEncoder
from tqdm import tqdm
import pandas as pd
import pyreadr
import numpy as np
import sys

## number of chunks the potential matches are split into; the array job
## has to cover every index 0 .. N_PARTS - 1 for all rows to be scored
N_PARTS = 200


def select_part(data, item, n_parts=N_PARTS):
    """Return chunk number `item` of `data` when split row-wise into `n_parts`.

    The chunk boundaries are those of ``np.array_split``: the first
    ``len(data) % n_parts`` chunks hold one row more than the rest, and
    chunks may be empty when there are fewer rows than parts. The split is
    computed on row positions rather than on the DataFrame itself, which
    avoids the ``DataFrame.swapaxes`` call that ``np.array_split`` makes on
    a DataFrame (deprecated in recent pandas).

    Args:
        data: DataFrame holding all potential matches.
        item: Zero-based chunk index.
        n_parts: Total number of chunks.

    Returns:
        DataFrame with the rows of the requested chunk, original index kept.

    Raises:
        IndexError: If `item` is outside ``0 .. n_parts - 1``. Negative
            values are rejected too; they would otherwise silently pick a
            chunk counted from the end.
    """
    if not 0 <= item < n_parts:
        raise IndexError(
            f"item {item} is out of range for {n_parts} parts "
            f"(expected 0-{n_parts - 1})"
        )
    row_positions = np.array_split(np.arange(len(data)), n_parts)[item]
    return data.iloc[row_positions]


def add_cross_scores(data_part, model):
    """Return a copy of `data_part` with a "cross_score" column added.

    Every (ego_summary, alter_summary) pair is scored by ``model.predict``
    in one batched call instead of one call per row.
    NOTE(review): batched inference may differ from pair-by-pair inference
    at floating-point rounding level -- confirm this is acceptable.

    Args:
        data_part: DataFrame with "ego_summary" and "alter_summary" columns.
        model: Object whose ``predict(pairs)`` returns one score per pair,
            in input order (here a sentence-transformers CrossEncoder).

    Returns:
        New DataFrame; `data_part` itself is left unmodified.
    """
    pairs = [
        [focal_summary, potential_summary]
        for focal_summary, potential_summary in zip(
            data_part["ego_summary"], data_part["alter_summary"]
        )
    ]
    ## an empty chunk is legal (fewer rows than parts); skip the model call
    ## because predict() looks at the first pair
    scores = model.predict(pairs) if pairs else []
    return data_part.assign(cross_score=scores)


def main():
    """Score the chunk selected by the first command line argument."""
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} <array index 0-{N_PARTS - 1}>")

    item = sys.argv[1]
    print(f"item value is {item}")
    item = int(item)

    ## read in data; pyreadr returns a dict and an .rds file holds a single
    ## unnamed object, which is stored under the key None
    data = pyreadr.read_r("data/potential_matches_bioweapons_cosine_sbert.rds")
    data = data[None]

    ## extract this array job's share of the potential matches
    data_part = select_part(data, item)

    ## pre trained cross encoder model
    model = CrossEncoder("cross-encoder/stsb-roberta-large")

    ## cross encoder score for every potential match in the chunk
    data_part = add_cross_scores(data_part, model)

    data_part.to_csv(f"data/cross_encoder/cross_encoder_bioweapons_cosine_sts_{item}.csv", index=False)


if __name__ == "__main__":
    main()

